# This is the script used to merge the (identifying) experimental data to the NJ voter file 
# It produces the de-identified version of the main dataset used for analysis
# This script is provided as part of the replication package but cannot be run (see readme.txt)
# Many thanks to Gabrielle Peloquin-Skulski for her work on this part of the project
# email Ariel (arwhi@mit.edu) with questions

#####################
# Clear environment #
#####################
rm(list=ls())

#########################
# Set working directory #
#########################
setwd("~/Dropbox (MIT)/NJISJ/Data")

#################
# Load packages #
#################

library(vroom)
library(tidyverse)
library(fastLink)
library(purrr)
library(data.table)
library(stringdist)

#################
# Open Datasets #
#################
exp_dat <- vroom("~/Dropbox (MIT)/NJISJ/Data/voter_file_cleaning/exp_data_clean.csv", delim = ",", col_select = -1) #main (identifying) experimental data
vote_dat <- vroom("~/Dropbox (MIT)/NJISJ/Data/voter_file_cleaning/combined_voter_files.csv", delim = ",", col_select = -1) #May 2022 snapshot
vote_dat_old <- vroom("~/Dropbox (MIT)/NJISJ/Data/voter_file_cleaning/pre_treatment_voter_files.csv", delim = ",", col_select = -1) #August 2021 snapshot
#actually, going to read these in later to avoid having so much stuff in memory
#vote_dat_election <- vroom("~/Dropbox (MIT)/NJISJ/Data/voter_file_cleaning/electionsnapshot_voter_files.csv", delim = ",", col_select = -1) #End of October 2021 snapshot 
#vote_dat_23 <- vroom("~/Dropbox (MIT)/NJISJ/Data/2023_voterfile_cleaning/combined_voter_files.csv", delim = ",", col_select = -1) #March 2023 snapshot

dim(vote_dat)
sum(vote_dat$general_2020); sum(vote_dat$primary_2021); sum(vote_dat$general_2021); sum(vote_dat_23$general_2022)

###one more thing before we introduce merge approaches: set up permuted DOBs for test inspired by Meredith & Morse
exp_dat$voterdobdt <- as.Date(exp_dat$voter_dob) 
exp_dat$bdayplus35 <- exp_dat$voterdobdt + 35 
exp_dat$bdayminus35 <- exp_dat$voterdobdt - 35 

vote_dat$voterdobdt <- as.Date(vote_dat$voter_dob) #line up date format in the other file before merge
vote_dat$bdayplus35 <- vote_dat$voterdobdt #note that this is not actually permuting the DOB on the VF side (that would break the test); just renaming so the names line up for merge

#set up/clean up some name components in case we need them to pare down merge
#looks like both datasets often have just middle initials
#but as Gabby noted in email, occasionally other weird stuff in the admin data like full names in middle field
#cutting down to initial (usually have this) & setting it to missing if middle name is implausibly long
exp_dat$voter_MI <- ifelse(nchar(exp_dat$voter_middle)>12, NA, substr(exp_dat$voter_middle,0,1))
vote_dat$voter_MI <- substr(vote_dat$voter_middle,0,1)
#sort(table(exp_dat$voter_MI)); sort(table(vote_dat$voter_MI)) #of course there are some numbers! hahaha
exp_dat[grepl("[^A-Za-z ]", exp_dat$voter_MI)==T, "voter_MI"] <- NA #swap in NA's for non-letter values
vote_dat[grepl("[^A-Za-z ]", vote_dat$voter_MI)==T, "voter_MI"] <- NA 

### start by matching only on DOB and then use name string distance to pare down
### note this is going to be computationally intensive
loose0 <-  merge(exp_dat, vote_dat, by=c("voter_dob")) #do a very loose merge on DOB
dim(loose0) 

#now pare down these potential matches using lastname and firstname string distance (and maybe middle initials)
loose0$fndist <- stringdist(loose0$voter_first.x, loose0$voter_first.y, method = "jw")
summary(loose0$fndist) #ok so lots of 0s (exact matches) and then some worse stuff

loose0$lndist <- stringdist(loose0$voter_last.x, loose0$voter_last.y, method = "jw") 
summary(loose0$lndist) 

head(loose0[loose0$fndist>0 & loose0$fndist<.15,]) #look at some medium ones
head(loose0[loose0$lndist>0 & loose0$lndist<.15,]) 

loose0b <- loose0[loose0$fndist<.15 & loose0$lndist<.15,]; dim(loose0b); length(unique(loose0b$row_id)) 

#trim a little further by comparing middle names/initials where present
#drop if middle initials are present in both datasets and different (but not if missing)
loose0c <- loose0b[loose0b$voter_MI.x==loose0b$voter_MI.y | is.na(loose0b$voter_MI.x) | is.na(loose0b$voter_MI.y) , ]
dim(loose0c); length(unique(loose0c$row_id))
#this doesn't drop so many people, since only half the exp data has middle initials; still worth it

###DOB-permutation test here (can comment this out after running bc of memory use/runtime)
#permute_loose0 <-  merge(exp_dat, vote_dat, by.x=c("bdayplus35"), by.y=c("voterdobdt")) 
#permute_loose0$fndist <- stringdist(permute_loose0$voter_first.x, permute_loose0$voter_first.y, method = "jw")
#permute_loose0$lndist <- stringdist(permute_loose0$voter_last.x, permute_loose0$voter_last.y, method = "jw")
#permute_loose0b <- permute_loose0[permute_loose0$fndist<.15 & permute_loose0$lndist<.15,]; dim(permute_loose0b); length(unique(permute_loose0b$row_id)) 
#permute_loose0c <- permute_loose0b[permute_loose0b$voter_MI.x==permute_loose0b$voter_MI.y | is.na(permute_loose0b$voter_MI.x) | is.na(permute_loose0b$voter_MI.y) , ]
#dim(permute_loose0c); length(unique(permute_loose0c$row_id))
#length(unique(permute_loose0c$row_id))/nrow(exp_dat) #still a v. low false positive rate here: 20 people match in this version
###ok great, so we pick up some more matches relative to the simplest approach but we don't seem to be pulling in a lot of false positives

#save(loose0c, file="~/Dropbox (MIT)/NJISJ/Data/voter_file_cleaning/tempmerged_mayfile.RData") #write this out so don't need to rerun everything above if session crashes

####################################################
### Use the same merge approach with the old VF copy
rm(loose0, loose0b, vote_dat) #save some memory
vote_dat_old$voter_MI <- substr(vote_dat_old$voter_middle,0,1) #set up middle initials in this dataset
vote_dat_old[grepl("[^A-Za-z ]", vote_dat_old$voter_MI)==T, "voter_MI"] <- NA 
loose0old <-  merge(exp_dat, vote_dat_old, by=c("voter_dob")) 
dim(loose0old) 

#now pare down these potential matches using lastname and firstname string distance (and maybe middle initials)
loose0old$fndist <- stringdist(loose0old$voter_first.x, loose0old$voter_first.y, method = "jw")
summary(loose0old$fndist) 
loose0old$lndist <- stringdist(loose0old$voter_last.x, loose0old$voter_last.y, method = "jw")
loose0oldb <- loose0old[loose0old$fndist<.15 & loose0old$lndist<.15,]; dim(loose0oldb); length(unique(loose0oldb$row_id)) 

loose0oldc <- loose0oldb[loose0oldb$voter_MI.x==loose0oldb$voter_MI.y | is.na(loose0oldb$voter_MI.x) | is.na(loose0oldb$voter_MI.y) , ]
dim(loose0oldc); length(unique(loose0oldc$row_id))

#save(loose0oldc, file="~/Dropbox (MIT)/NJISJ/Data/voter_file_cleaning/tempmerged_oldfile.RData") #write this out so don't need to rerun everything above if session crashes
####################################################
#now, same thing with election-time snapshot (for most accurate registration counts as of election day)
rm(loose0old, loose0oldb) #save some memory before loading more VFs
vote_dat_election <- vroom("~/Dropbox (MIT)/NJISJ/Data/voter_file_cleaning/electionsnapshot_voter_files.csv", delim = ",", col_select = -1) #End of October 2021 snapshot 
vote_dat_23 <- vroom("~/Dropbox (MIT)/NJISJ/Data/2023_voterfile_cleaning/combined_voter_files.csv", delim = ",", col_select = -1) #March 2023 snapshot

vote_dat_election$voter_MI <- substr(vote_dat_election$voter_middle,0,1) 
vote_dat_election[grepl("[^A-Za-z ]", vote_dat_election$voter_MI)==T, "voter_MI"] <- NA 
loose0elec <-  merge(exp_dat, vote_dat_election, by=c("voter_dob")) 
dim(loose0elec) 

loose0elec$fndist <- stringdist(loose0elec$voter_first.x, loose0elec$voter_first.y, method = "jw")
summary(loose0elec$fndist) 
loose0elec$lndist <- stringdist(loose0elec$voter_last.x, loose0elec$voter_last.y, method = "jw") 
loose0elecb <- loose0elec[loose0elec$fndist<.15 & loose0elec$lndist<.15,]; dim(loose0elecb); length(unique(loose0elecb$row_id)) 
rm(loose0elec)
loose0elecc <- loose0elecb[loose0elecb$voter_MI.x==loose0elecb$voter_MI.y | is.na(loose0elecb$voter_MI.x) | is.na(loose0elecb$voter_MI.y) , ]
dim(loose0elecc); length(unique(loose0elecc$row_id))

#save(loose0elecc, file="~/Dropbox (MIT)/NJISJ/Data/voter_file_cleaning/tempmerged_electionfile.RData") #write this out so don't need to rerun everything above if session crashes

####################################################
#now, same thing with post-2022 snapshot (for long-run effects)
rm(loose0elecb) #save some memory
vote_dat_23$voter_MI <- substr(vote_dat_23$voter_middle,0,1) 
vote_dat_23[grepl("[^A-Za-z ]", vote_dat_23$voter_MI)==T, "voter_MI"] <- NA 
loose0longrun <-  merge(exp_dat, vote_dat_23, by=c("voter_dob")) 
dim(loose0longrun) 

loose0longrun$fndist <- stringdist(loose0longrun$voter_first.x, loose0longrun$voter_first.y, method = "jw")
summary(loose0longrun$fndist) 
loose0longrun$lndist <- stringdist(loose0longrun$voter_last.x, loose0longrun$voter_last.y, method = "jw") 
loose0longrunb <- loose0longrun[loose0longrun$fndist<.15 & loose0longrun$lndist<.15,]; dim(loose0longrunb); length(unique(loose0longrunb$row_id)) 
rm(loose0longrun)
loose0longrunc <- loose0longrunb[loose0longrunb$voter_MI.x==loose0longrunb$voter_MI.y | is.na(loose0longrunb$voter_MI.x) | is.na(loose0longrunb$voter_MI.y) , ]
dim(loose0longrunc); length(unique(loose0longrunc$row_id))

#load("~/Dropbox (MIT)/NJISJ/Data/voter_file_cleaning/tempmerged_oldfile.RData") #can load in above merge results if running in chunks
#load("~/Dropbox (MIT)/NJISJ/Data/voter_file_cleaning/tempmerged_mayfile.RData")
#load("~/Dropbox (MIT)/NJISJ/Data/voter_file_cleaning/tempmerged_electionfile.RData")

#output merged dataset with registration/turnout flags from the snapshots
exp_dat$registered <- 0; exp_dat$voted_21 <- 0
#exp_dat[exp_dat$row_id %in% loose0c$row_id, "registered"] <- 1
#table(exp_dat$registered)
#exp_dat$registered <- 0;
exp_dat[exp_dat$row_id %in% loose0elecc$row_id, "registered"] <- 1
table(exp_dat$registered) 

loose0c_voters <- loose0c[loose0c$general_2021==1,]
exp_dat[exp_dat$row_id %in% loose0c_voters$row_id, "voted_21"] <- 1
table(exp_dat$registered); table(exp_dat$voted_21); 

exp_dat$voted_22 <- 0 #add in 2022 turnout (a year later) from later snapshot
loose0longrunc_voters <- loose0longrunc[loose0longrunc$general_2022==1,]
exp_dat[exp_dat$row_id %in% loose0longrunc_voters$row_id, "voted_22"] <- 1
table(exp_dat$voted_22); 

exp_dat$registered22 <- 0; #and also later registration for long-run effects
exp_dat[exp_dat$row_id %in% loose0longrunc$row_id, "registered22"] <- 1
table(exp_dat$registered22) 

exp_dat$registeredpretreat <- 0; #set up a new indicator, this time for pretreat registration
exp_dat[exp_dat$row_id %in% loose0oldc$row_id, "registeredpretreat"] <- 1
table(exp_dat$registeredpretreat)

#now put in flags for previous vote history
general2020votermatches <- loose0oldc[loose0oldc$general_2020==1,]; dim(general2020votermatches)
primary2021votermatches <- loose0oldc[loose0oldc$primary_2021==1,]; dim(primary2021votermatches)
exp_dat$voted2020g_pretreat <- 0; exp_dat$voted2021p_pretreat <- 0; 
exp_dat[exp_dat$row_id %in% general2020votermatches$row_id, "voted2020g_pretreat"] <- 1
table(exp_dat$voted2020g_pretreat)
exp_dat[exp_dat$row_id %in% primary2021votermatches$row_id, "voted2021p_pretreat"] <- 1
table(exp_dat$voted2021p_pretreat)

#now, set up "updated-registration" field (comparing across snapshots as described in paper)
keepcols <- c("voter_id", "voter_registration_date", "voter_first", "voter_last", "street_num", "street_name", "zipcode","voter_party" ) #columns we'll compare across snapshots
changetest <- merge(vote_dat_election[vote_dat_election$voter_id %in% loose0c$voter_id,keepcols], vote_dat_old[vote_dat_old$voter_id %in% loose0c$voter_id, keepcols], by="voter_id", all.x=T) 

changetest$VRDdiff <- ifelse(changetest$voter_registration_date.x == changetest$voter_registration_date.y, 0,1); summary(changetest$VRDdiff) #and try comparing across snapshots too
changetest$fndiff <- ifelse(changetest$voter_first.x==changetest$voter_first.y,0,1) 
changetest$lndiff <- ifelse(changetest$voter_last.x==changetest$voter_last.y,0,1) 
changetest$streetnamediff <- ifelse(changetest$street_name.x==changetest$street_name.y,0,1) 
changetest$partydiff <- ifelse(changetest$voter_party.x==changetest$voter_party.y,0,1) 
summary(changetest$fndiff); summary(changetest$lndiff); summary(changetest$streetnamediff); summary(changetest$partydiff)  

changetest$anychange <- ifelse(is.na(changetest$fndiff)==T | changetest$fndiff==1 | changetest$lndiff==1 | changetest$partydiff==1 | changetest$streetnamediff==1,1,0 ) 
 
regchanges <- changetest[changetest$anychange==1,]; dim(regchanges)
markchanges <- loose0c[loose0c$voter_id %in% regchanges$voter_id,] 

exp_dat$updatedregistration <- 0; 
exp_dat[exp_dat$row_id %in% markchanges$row_id, "updatedregistration"] <- 1
table(exp_dat$updatedregistration)

#and now do the same thing with updated reg but for 2022 endpoint
keepcols <- c("voter_id", "voter_registration_date", "voter_first", "voter_last", "street_num", "street_name", "zipcode","voter_party" ) #columns we'll compare across snapshots
changetest23 <- merge(vote_dat_23[vote_dat_23$voter_id %in% loose0longrunc$voter_id,keepcols], vote_dat_old[vote_dat_old$voter_id %in% loose0longrunc$voter_id, keepcols], by="voter_id", all.x=T) 

changetest23$VRDdiff <- ifelse(changetest23$voter_registration_date.x == changetest23$voter_registration_date.y, 0,1); summary(changetest23$VRDdiff) #and try comparing across snapshots too
changetest23$fndiff <- ifelse(changetest23$voter_first.x==changetest23$voter_first.y,0,1) 
changetest23$lndiff <- ifelse(changetest23$voter_last.x==changetest23$voter_last.y,0,1) 
changetest23$streetnamediff <- ifelse(changetest23$street_name.x==changetest23$street_name.y,0,1) 
changetest23$partydiff <- ifelse(changetest23$voter_party.x==changetest23$voter_party.y,0,1) 
summary(changetest23$fndiff); summary(changetest23$lndiff); summary(changetest23$streetnamediff); summary(changetest23$partydiff)  

changetest23$anychange <- ifelse(is.na(changetest23$fndiff)==T | changetest23$fndiff==1 | changetest23$lndiff==1 | changetest23$partydiff==1 | changetest23$streetnamediff==1,1,0 ) 
 
regchanges23 <- changetest23[changetest23$anychange==1,]; dim(regchanges23)
markchanges23 <- loose0longrunc[loose0longrunc$voter_id %in% regchanges23$voter_id,] 

exp_dat$updatedregistration23 <- 0; 
exp_dat[exp_dat$row_id %in% markchanges23$row_id, "updatedregistration23"] <- 1
table(exp_dat$updatedregistration23)



write.csv(exp_dat, file.path("~/Dropbox (MIT)/NJISJ/Data/voter_file_cleaning/", "matched_data_manualnamedistance_LNfuzzy.csv")) #identifying merged data

### read in the file AI produced by merging treatment list to voter file previously (for race field and also to allow for merge comparison)
AIdata <- read.csv("./AnalystAnalysis/final_analysis_27april2022.csv"); dim(AIdata)
#going to pull the race_model field over from the AI dataset since they ran wru on the list: keep consistent across code versions rather than redoing
table(AIdata$race_model)
dim(unique(AIdata[, c("first_name", "last_name","dob")])); length(unique(AIdata[, c("ai_id")]))
airace <- AIdata[, c("ai_id", "race_model")]
maindata <- merge(exp_dat, airace, by.x=c("ai_id")); dim(maindata) 


### Now, keep only the columns needed for analysis (drop PII)
colnames(maindata)
maindata$ageyears <- round(maindata$age,digits=0) #coarsen age to years so we don't give away exact DOBs
analysiscols <- c("row_id","ageyears", "gender", "race", "race_model","hh_id", "source_race", "source_list","hh_size", "assign_treat","registered", "voted_21", "voted_22", "registered22", "registeredpretreat", "voted2020g_pretreat", "voted2021p_pretreat", "updatedregistration","updatedregistration23")
deid <- maindata[, analysiscols]

write.csv(deid, "~/Dropbox (MIT)/NJISJ/replicationpackage/DATA/NJISJ_fall2021exp_deid.csv")




